{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# ABSA data processing\n",
    "This notebook pre-processes the xml format data files into pkl file that we will be using for the experiments. \n",
    "\n",
    "Before following the steps in this notebook, you have to 1) download the data and 2) rename it into a specific format.\n",
    "\n",
    "1. Go to http://alt.qcri.org/semeval2016/task5/index.php?id=data-and-tools and download training and test data of Arabic (ar), Chinese (zh), English (en), Russian (ru), Spanish (es). You can select subtask 1 for all languages, and you will have to sign up to get access to the data. For Chinese, we used Mobile Phone reviews, but you can use the digital camera one too. \n",
    "2. Rename all downloaded data to the format of `{two_character_lang_code}_{train/test}.xml`. For example, the Chinese training file should be renamed to `zh_train.xml`, and the Russian test file should be `ru_test.xml`.\n",
    "3. make new directory called `data/` and move all files under the directory.\n",
    "4. Execute this notebook and you are good to go! -- it will create `absa.pkl` under the same `data/` directory."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-13T02:19:08.400102Z",
     "start_time": "2020-05-13T02:19:08.396349Z"
    }
   },
   "outputs": [],
   "source": [
    "import pickle\n",
    "import xml.etree.ElementTree as ET"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-13T01:27:58.595788Z",
     "start_time": "2020-05-13T01:27:58.585973Z"
    }
   },
   "outputs": [],
   "source": [
    "def aggregate_labels(opinions:list) -> \"str\":\n",
    "    labels, categories = zip(*opinions)\n",
    "    if len(set(labels)) == 1:\n",
    "        return \"pos\" if labels[0] == \"positive\" else \"neg\"\n",
    "    else:\n",
    "        num_pos, num_neg = labels.count(\"positive\"), labels.count(\"negative\")\n",
    "        if num_pos > num_neg:\n",
    "            return \"pos\"\n",
    "        elif num_neg < num_pos:\n",
    "            return \"neg\"\n",
    "        else:\n",
    "            return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-13T02:50:01.022012Z",
     "start_time": "2020-05-13T02:50:00.615593Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "en-train: 100/1708\n",
      "en-test: 32/587\n",
      "es-train: 91/1626\n",
      "es-test: 27/677\n",
      "ru-train: 70/2733\n",
      "ru-test: 43/908\n",
      "ar-train: 352/4790\n",
      "ar-test: 80/1225\n",
      "zh-train: 0/1333\n",
      "zh-test: 0/529\n"
     ]
    }
   ],
   "source": [
    "data = {}\n",
    "for lang in [\"en\",\"es\",\"ru\",\"ar\",\"zh\"]:\n",
    "    data[lang] = {}\n",
    "    for mode in [\"train\",\"test\"]:\n",
    "        tree = ET.parse(f'data/{mode}_{lang}.xml')\n",
    "        root = tree.getroot()\n",
    "        xs, ys = [], []\n",
    "        for child in root:\n",
    "            for sentences in child:\n",
    "                for sent in sentences:\n",
    "                    if sent.find(\"Opinions\"):\n",
    "                        opinions = [(opinion.attrib[\"polarity\"], opinion.attrib[\"category\"]) for opinion in sent.find(\"Opinions\")]\n",
    "                        if len(opinions) > 0:\n",
    "                            xs.append(sent.find(\"text\").text)\n",
    "                            ys.append(aggregate_labels(opinions))\n",
    "        len_old = len(ys)\n",
    "        xs, ys = zip(*[(x,y) for x,y in zip(xs, ys) if y!=None])\n",
    "        print(f\"{lang}-{mode}: {len_old-len(ys)}/{len_old}\")\n",
    "        assert len(xs) == len(ys)\n",
    "        data[lang][mode] = list(zip(xs,ys))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-13T02:20:34.951384Z",
     "start_time": "2020-05-13T02:20:34.911945Z"
    }
   },
   "outputs": [],
   "source": [
    "with open(\"data/absa.pkl\",\"wb\") as file:\n",
    "    pickle.dump(data, file)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:etc]",
   "language": "python",
   "name": "conda-env-etc-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  },
  "nbTranslate": {
   "displayLangs": [
    "*"
   ],
   "hotkey": "alt-t",
   "langInMainMenu": true,
   "sourceLang": "en",
   "targetLang": "fr",
   "useGoogleTranslate": true
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
